Result Showcase

In [1]:
# Necessary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import IFrame, HTML
from wordcloud import WordCloud, STOPWORDS
In [2]:
# Load the paired-argument dataset and all precomputed statistics
df = pd.read_csv('../data/complete_data.tsv', sep='\t')

def _load_stats(path):
    """Load a pickled dict of per-dataset statistics frames from an .npy file."""
    return np.load(path, allow_pickle=True).item()

data_stats_org = _load_stats('../data/stats/data_stats_org.npy')
data_stats_resp = _load_stats('../data/stats/data_stats_resp.npy')
data_stats_topic = _load_stats('../data/stats/data_stats_topic.npy')
data_stats_sent = _load_stats('../data/stats/sent_stats.npy')
data_stats_author = pd.read_csv('../data/stats/data_stats_author.tsv', sep='\t')
data_stats_total = pd.read_csv('../data/stats/data_stats_total.tsv', sep='\t')
data_nix_ken = pd.read_csv('../data/stats/data_nix_ken.tsv', sep='\t')
In [3]:
# General stats of all datasets: number of unique arguments, total pairs,
# attack/support and unrelated counts, plus total (org+response) length stats
# (length matters for BERT's seq_len parameter)
# Caveats:
#  * debate_test/train is already repaired, but still not the same as in the paper
#  * agreement had many unparseable rows (empty org/resp) which were excluded,
#    so the dataset is smaller than reported in the paper
#  * the political dataset contains two duplicate pairs
shown_datasets = ['debate_test', 'debate_train', 'procon', 'political', 'agreement']
data_stats_total.loc[data_stats_total['dataset'].isin(shown_datasets)]
Out[3]:
dataset args tot attack/disagreement support/agreement unrelated mean_total_len median_total_len max_total_len
0 agreement 7930 29244 15259 13985 0 191.008754 181.0 1198.0
8 debate_test 114 102 44 58 0 79.235294 81.0 200.0
9 debate_train 109 98 46 52 0 82.418367 70.0 264.0
10 political 1065 1462 378 353 731 215.902189 207.0 593.0
11 procon 66 60 30 30 0 56.533333 53.5 139.0
In [4]:
# Debate train/test statistics broken down by topic
# Topics not matching the paper: Internetaccess and Militaryservice
# Most topics' attack/support distributions resemble the overall distribution
debate_topic_tables = (data_stats_topic['debate_train'], data_stats_topic['debate_test'])
pd.concat(debate_topic_tables, keys=['train', 'test'])
Out[4]:
topic args tot attack support mean_total_len median_total_len max_total_len
train 0 Arminglibianrebels 11 9 5 4 94.444444 80.0 157.0
1 Childbeautycontests 13 11 4 7 75.272727 62.0 145.0
2 Chinaonechildpolicy 11 10 4 6 104.000000 89.5 173.0
3 Cocanarcotic 15 14 7 7 80.714286 72.0 183.0
4 Internetaccess 13 12 6 6 115.250000 94.5 264.0
5 Osamaphoto 11 10 5 5 63.400000 63.0 145.0
6 Privatizingsocialsecurity 11 10 5 5 67.600000 57.5 130.0
7 Sobrietytest 8 7 3 4 104.428571 75.0 202.0
8 Violentgames 16 15 7 8 53.666667 50.0 120.0
9 NaN 109 98 46 52 758.776696 643.5 1519.0
test 0 Cellphones 11 10 5 5 84.100000 80.0 159.0
1 Gasvehicles 13 11 6 5 94.090909 86.0 133.0
2 Gaymarriage 7 6 2 4 94.333333 96.0 142.0
3 Groundzeromosque 10 8 5 3 101.500000 93.0 155.0
4 Marijuanafree 17 16 6 10 73.812500 69.0 163.0
5 Militaryservice 13 12 6 6 56.500000 60.5 103.0
6 Noflyzone 11 10 4 6 72.200000 76.5 126.0
7 Securityprofiling 9 8 4 4 90.250000 88.0 129.0
8 Solarenergy 16 15 4 11 64.066667 59.0 111.0
9 Vegetarianism 7 6 2 4 94.000000 96.5 200.0
10 NaN 114 102 44 58 824.853409 804.5 1421.0
In [5]:
# Political dataset statistics broken down by topic
# Most topics have a similar label distribution; "minimum wage" is an exception
# (the NaN topic row is the dataset-wide total)
data_stats_topic['political']
Out[5]:
topic args tot attack support unrelated mean_total_len median_total_len max_total_len
0 cuba 253 258 38 40 180 220.403101 216.0 531.0
1 disarmament 172 316 76 108 132 228.582278 209.0 562.0
2 medical care,health care 279 289 75 72 142 210.948097 212.0 461.0
3 minimum wage 199 312 125 80 107 209.535256 208.5 460.0
4 unemployment 211 287 64 53 170 209.804878 193.0 593.0
5 NaN 1114 1462 378 353 731 1079.273611 1038.5 2607.0
In [6]:
# Political by author
# The same author mostly supports himself, different authors mostly attack each other
# Heavily imbalanced with respect to the author: Kennedy occurs far more often
author_counts = data_nix_ken.groupby("author").nunique()
print(author_counts)
data_stats_author.style.background_gradient(cmap='Blues')
         author  text
author               
Kennedy       1   739
Nixon         1   326
Out[6]:
author_resp author_org args tot attack support unrelated mean_total_len median_total_len max_total_len
0 Kennedy Kennedy 496 467 2 195 270 200.514 188 588
1 Kennedy Nixon 501 419 172 55 192 232.993 221 593
2 Nixon Kennedy 493 461 202 42 217 210.126 208 531
3 Nixon Nixon 154 115 2 61 52 239.278 211 556
4 nan nan 1644 1462 378 353 731 882.911 828 2268
In [7]:
# Show the duplicated (org, response) pairs in the political dataset
dataset_name = 'political'
print(dataset_name + " Duplicates:")
political_pairs = df[df['org_dataset'] == dataset_name]
duplicate_mask = political_pairs.duplicated(subset=['org', 'response'], keep=False)
print(political_pairs[duplicate_mask])
political Duplicates:
     org_dataset    id                                                org  \
3813   political   628  We have gone into every conference unprepared ...   
4557   political  1599  I select it because I believe it is the most i...   
4642   political  1705  We have gone into every conference unprepared ...   
4725   political  1817  I select it because I believe it is the most i...   

     org_stance                                           response  \
3813    Kennedy  And you ask about prestige . What do our oppon...   
4557      Nixon  After 3 months , his report was dismissed and ...   
4642    Kennedy  And you ask about prestige . What do our oppon...   
4725      Nixon  After 3 months , his report was dismissed and ...   

     response_stance      label        topic  org_len  response_len  \
3813           Nixon     attack  disarmament       93           189   
4557         Kennedy  unrelated  disarmament      431            79   
4642           Nixon  unrelated  disarmament       93           189   
4725         Kennedy  unrelated  disarmament      431            79   

      complete_len  
3813           282  
4557           510  
4642           282  
4725           510  

Length

In [8]:
# Distribution of org, resp and combined lengths over the different datasets
# seq_len 128/200 covers ~75% of the debate dataset, 250 covers ~75% of political
fig, axes = plt.subplots(1, 2, figsize=(10, 4))  # 1 row, 2 columns

for data_set, ax in zip(('debate_extended', 'political'), axes):
    subset = df[df['org_dataset'] == data_set]
    subset.boxplot(ax=ax)
    ax.set_title(data_set)
plt.tight_layout()

Attack/Support ratios

In [9]:
# Plot the attack-ratio per org argument (how often an argument is attacked)
# Most arguments are only attacked or only supported (interesting for detecting
# arguments likely to be attacked/supported)
# Disregarding arguments answered only once, most arguments have an attack-ratio of 0.5
# In political many pairs are unrelated; unrelated pairs are disregarded in this plot
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(10, 4))  # 2 rows, 2 columns
for data_set, ax in [('debate_extended', ax1), ('political', ax2)]:
    # iloc[:-1] drops the dataset-wide summary row
    df_plot = data_stats_org[data_set].iloc[:-1].apply(
        # Second column: same ratio, but NaN for single-answer arguments
        # (label typo fixed: "exluding" -> "excluding")
        lambda r: pd.Series({"Attack-ratio": r.attacked / r.tot,
                             "Attack-ratio (excluding arguments only attacked/supported once)": np.nan if r.tot == 1 else r.attacked / r.tot}),
        axis=1)
    df_plot.hist(density=False, ax=ax)
    ax[0].set_ylabel(data_set, rotation=0)

plt.tight_layout()

Usage of arguments

In [10]:
# Left column: how many answers (incoming links) an org argument has
# Right column: how many outgoing links a response argument has
# Most arguments have one incoming link, but some have many: ~10 debate, ~30 political
# In debate (original) every argument has exactly one outgoing link;
# in political most have one, but some have many (~8)
fig, (ax1, ax2, ax3) = plt.subplots(3, 2, figsize=(10, 4))  # 3 rows, 2 columns

dataset_axes = [('debate_test', ax1), ('debate_extended', ax2), ('political', ax3)]
for data_set, ax in dataset_axes:
    # iloc[:-1] drops the dataset-wide summary row
    incoming = data_stats_org[data_set].iloc[:-1]['tot']
    incoming.hist(density=True, ax=ax[0])
    ax[0].set_title('{0}, org'.format(data_set))
    ax[1].set_title('{0}, resp'.format(data_set))
    outgoing = data_stats_resp[data_set].iloc[:-1]['tot']
    outgoing.hist(bins=np.arange(0, 10), ax=ax[1])
plt.tight_layout()

Visualizations Debate Responses

  • Word scattertext of the responses in debate_train
  • Lime and anchor visualization of an example sentence, using only_response (rest default options)
    • model has accuracy 53% (quite bad)
    • details about LIME here
    • details about ANCHOR here
In [11]:
# Scattertext of the responses in debate_train (pre-generated HTML, embedded inline)
# No special "attacking" or "supporting" words easily recognizable
# The words are either topic specific, e.g. China (in topic Chinaonechildpolicy there are more supports than attacks)
# Or they seem to be there by chance (small dataset), e.g. he, does
IFrame(src='./scattertext_attack_supportdebate_train.html', width=950, height=500)
Out[11]:
In [12]:
# LIME visualization (pre-generated HTML)
# Some words influence the prediction as expected, e.g. "are" and "not" (attack), "play" and "alcohol" (support)
# Others do not have the expected influence, e.g. "china" (attack, not support as expected)
# Overall, all weights are really small and the removal/replacement with UNK of a single word
# does not change the prediction
HTML(filename='./lime.html')
Out[12]:
In [13]:
# Anchor visualization (pre-generated HTML)
# Anchor did not find a set of words whose change flips the predicted class
HTML(filename='./anchor.html')
Out[13]:

Visualizations Political Authors

  • WordClouds authors
  • Scattertext authors
  • Lime and Anchor, only_org (rest default), attack/support
    • Model acc: 70%, F1: 70%
In [14]:
# Word clouds for Kennedy and for Nixon
# Both candidates often say the name of the other; Nixon talks about President Eisenhower
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 10))  # 1 row, 2 columns

stopwords = set(STOPWORDS)

def _author_wordcloud(author, ax):
    """Render a word cloud of all of `author`'s utterances onto `ax`."""
    text = " ".join(t for t in data_nix_ken.loc[data_nix_ken["author"] == author, 'text'])
    cloud = WordCloud(stopwords=stopwords).generate(text)
    ax.imshow(cloud, interpolation="bilinear")
    ax.set_title("{0} WordCloud".format(author))
    ax.set_axis_off()

# Previously this was two copy-pasted blocks; one helper keeps them in sync
_author_wordcloud("Nixon", ax1)
_author_wordcloud("Kennedy", ax2)

plt.tight_layout()
In [15]:
# Scattertext of the authors in the political dataset (pre-generated HTML)
# The word usage of Nixon and Kennedy is quite different
IFrame(src='./scattertext_nixon_kennedy.html', width=950, height=500)
Out[15]:
In [16]:
# LIME visualization for the political model (pre-generated HTML)
# All words have a very small impact on the prediction
HTML(filename='./lime_pol.html')
Out[16]:
In [17]:
# Anchor visualization for the political model (pre-generated HTML)
# No anchor rule was found
HTML(filename='./anchor_pol.html')
Out[17]:

Baselines

  • TODO: for the grouped results, actually calculate the weighted average/baseline
In [18]:
# Majority-class baseline helpers
def get_major_acc(x, classes=['unrelated', 'attack/disagreement', 'support/agreement']):
    """Accuracy of always predicting the most frequent class of `classes` in row `x`.

    Returns NaN when the row has no pairs in any of the given classes
    (previously this was a 0/0 np.divide that emitted a RuntimeWarning).
    """
    total = np.sum(x[classes])
    if total == 0:
        return np.nan
    return np.divide(x[classes].max(), total)

def get_major_class(x, classes=['unrelated', 'attack/disagreement', 'support/agreement']):
    """Name of the most frequent class of `classes` in row `x` (ties -> first)."""
    return x[classes].astype('float64').idxmax()

# Majority-class baseline accuracy/class for every dataset, shown for the two main ones
data_stats_total['major_acc'] = data_stats_total.apply(get_major_acc, axis=1)
data_stats_total['major_class'] = data_stats_total.apply(get_major_class, axis=1)

shown = data_stats_total['dataset'].isin(['debate_test', 'political'])
data_stats_total.loc[shown, ['dataset', 'major_class', 'major_acc']]
/media/jannis/GeDaTS/envs/pytorch/lib/python3.7/site-packages/ipykernel_launcher.py:3: RuntimeWarning: invalid value encountered in true_divide
  This is separate from the ipykernel package so we can avoid doing imports until
Out[18]:
dataset major_class major_acc
8 debate_test support/agreement 0.568627
10 political unrelated 0.500000
In [19]:
# Majority-class baseline per topic (NoDE debate_test)
# .copy() so the shared data_stats_topic dict is not mutated in place
# (consistent with the author-level baseline cells, which already copy)
data = data_stats_topic['debate_test'].copy()
data['major_acc'] = data.apply(get_major_acc, args=[['attack','support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack','support']], axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]
Out[19]:
topic major_class major_acc tot
0 Cellphones attack 0.500000 10
1 Gasvehicles attack 0.545455 11
2 Gaymarriage support 0.666667 6
3 Groundzeromosque attack 0.625000 8
4 Marijuanafree support 0.625000 16
5 Militaryservice attack 0.500000 12
6 Noflyzone support 0.600000 10
7 Securityprofiling attack 0.500000 8
8 Solarenergy support 0.733333 15
9 Vegetarianism support 0.666667 6
10 NaN support 0.568627 102
In [20]:
# Majority-class baseline per topic (political, all three labels)
# .copy() so the shared data_stats_topic dict is not mutated in place
data = data_stats_topic['political'].copy()
data['major_acc'] = data.apply(get_major_acc, args=[['attack','support', 'unrelated']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack','support', 'unrelated']], axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]
Out[20]:
topic major_class major_acc tot
0 cuba unrelated 0.697674 258
1 disarmament unrelated 0.417722 316
2 medical care,health care unrelated 0.491349 289
3 minimum wage attack 0.400641 312
4 unemployment unrelated 0.592334 287
5 NaN unrelated 0.500000 1462
In [21]:
# Majority-class baseline per topic (political, attack/support only)
# .copy() so the shared data_stats_topic dict is not mutated in place
data = data_stats_topic['political'].copy()
data['major_acc'] = data.apply(get_major_acc, args=[['attack','support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack','support']], axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]
Out[21]:
topic major_class major_acc tot
0 cuba support 0.512821 258
1 disarmament support 0.586957 316
2 medical care,health care attack 0.510204 289
3 minimum wage attack 0.609756 312
4 unemployment attack 0.547009 287
5 NaN attack 0.517100 1462
In [22]:
# Majority-class baseline per (response author, org author) pair, all three labels
data = data_stats_author.copy()
label_cols = ['attack', 'support', 'unrelated']
data['major_acc'] = data.apply(get_major_acc, args=[label_cols], axis=1)
data['major_class'] = data.apply(get_major_class, args=[label_cols], axis=1)
data[['author_resp', 'author_org', 'major_class', 'major_acc', 'tot']]
Out[22]:
author_resp author_org major_class major_acc tot
0 Kennedy Kennedy unrelated 0.578158 467
1 Kennedy Nixon unrelated 0.458234 419
2 Nixon Kennedy unrelated 0.470716 461
3 Nixon Nixon support 0.530435 115
4 NaN NaN unrelated 0.500000 1462
In [23]:
# Majority-class baseline per (response author, org author) pair, attack/support only
data = data_stats_author.copy()
label_cols = ['attack', 'support']
data['major_acc'] = data.apply(get_major_acc, args=[label_cols], axis=1)
data['major_class'] = data.apply(get_major_class, args=[label_cols], axis=1)
data[['author_resp', 'author_org', 'major_class', 'major_acc', 'tot']]
Out[23]:
author_resp author_org major_class major_acc tot
0 Kennedy Kennedy support 0.989848 467
1 Kennedy Nixon attack 0.757709 419
2 Nixon Kennedy attack 0.827869 461
3 Nixon Nixon support 0.968254 115
4 NaN NaN attack 0.517100 1462
In [24]:
# Merge author pairs into same-author / different-author groups
# Very high accuracy is possible by only detecting whether response and org share the author
data = data_stats_author.iloc[:-1].copy()  # drop the dataset-wide total row before grouping
data['authors'] = data.apply(lambda r: 'Same' if r['author_resp'] == r['author_org'] else 'Different', axis=1)
# NOTE(review): .sum() also aggregates the non-numeric author columns -- harmless for
# the columns displayed below, but a weighted baseline is still TODO
data = data.groupby('authors').sum()
data = data.reset_index()
data['major_acc'] = data.apply(get_major_acc, args=[['attack','support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack','support']], axis=1)

data[['authors', 'major_class', 'major_acc', 'tot']]
Out[24]:
authors major_class major_acc tot
0 Different attack 0.794055 880
1 Same support 0.984615 582
In [25]:
# Sentiment analysis (nltk vader), responses only

# debate_test: supporting arguments often have a positive sentiment,
# attacking arguments show nothing special
pd.concat((data_stats_sent['respdebate_test'],data_stats_sent['resppolitical']), keys=['node', 'political'], sort=True)
Out[25]:
All attack support unrelated
discrete_polarity
node negative 46 23 23 NaN
neutral 6 3 3 NaN
positive 50 18 32 NaN
All 102 44 58 NaN
political negative 373 87 68 218.0
neutral 37 5 12 20.0
positive 1052 286 273 493.0
All 1462 378 353 731.0
In [26]:
# Sentiment over both org and response
# node: attack pairs often differ in sentiment, support pairs often share it
# Nothing meaningful for political
pd.concat((data_stats_sent['bothdebate_test'],data_stats_sent['bothpolitical']), keys=['node', 'political'], sort=True)
Out[26]:
discrete_polarity_both Different Same
label
node attack 25 19
support 22 36
political attack 114 264
support 112 241
unrelated 310 421
In [27]:
# .... ?
# Major Class for every Org argument
# Major Class for every Resp argument (only political)

Results

NoDE paper

In [28]:
# NoDE accuracy across the hyper-parameter grid
# Fixed: input=both, seq_len=128, warmup_prop=0.1, seed=42
# Tested: model=base-uncased,large-uncased, epochs=3,4,5, batch_size=8,12,16, lr=2e-5,3e-5,5e-5
# Gradient accumulation: batch_size/4 for bert_large
# (in principle equivalent, in practice different because of rounding errors etc.)
eval_results = pd.read_csv('../pytorch/node_both/eval_results.tsv', sep='\t')

summary_funcs = [np.mean, np.min, np.max, np.std]
# Paper acc 0.67, best BERT acc 0.74, mean (bert-base) 0.62, baselines ~0.6
print(eval_results['acc'].agg(summary_funcs))
# Somehow bert-large performs worse than bert-base
print(eval_results.groupby('_bert-model')['acc'].agg(summary_funcs))
print()
# Hyper-parameter settings of the single best run
print(eval_results.iloc[eval_results['acc'].idxmax()])

# Show the table
eval_results.head()
mean    0.574437
amin    0.431373
amax    0.745098
std     0.097026
Name: acc, dtype: float64
                        mean      amin      amax       std
_bert-model                                               
bert-base-uncased   0.624546  0.431373  0.745098  0.097123
bert-large-uncased  0.524328  0.431373  0.607843  0.067397

_batch_size                      12
_bert-model       bert-base-uncased
_gradient_acc                     1
_input_mode                    both
_learning_rate                2e-05
_num_epochs                       5
_seed                            42
_seq_len                        128
_task                          node
_warmup                         0.1
acc                        0.745098
eval_loss                  0.720852
global_step                      70
loss                              0
Name: 51, dtype: object
Out[28]:
_batch_size _bert-model _gradient_acc _input_mode _learning_rate _num_epochs _seed _seq_len _task _warmup acc eval_loss global_step loss
0 4 bert-large-uncased 4 both 0.00002 3.0 42 128 node 0.1 0.568627 0.689357 30 0.0
1 4 bert-large-uncased 4 both 0.00003 3.0 42 128 node 0.1 0.568627 0.684217 30 0.0
2 4 bert-large-uncased 4 both 0.00005 3.0 42 128 node 0.1 0.431373 0.694603 30 0.0
3 4 bert-large-uncased 2 both 0.00002 3.0 42 128 node 0.1 0.568627 0.691577 60 0.0
4 4 bert-large-uncased 2 both 0.00003 3.0 42 128 node 0.1 0.568627 0.685088 60 0.0

Political Paper

In [29]:
# Political related/unrelated, cross-validated F1
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/pol_ru/eval_results.tsv', sep='\t')

# Paper average F1 0.65, here average F1 0.68, baseline ?
f1_summary = eval_results['f1'].agg([np.mean, np.min, np.max, np.std])
print(f1_summary)
eval_results.head()
mean    0.679147
amin    0.602068
amax    0.714915
std     0.038944
Name: f1, dtype: float64
Out[29]:
_batch_size _bert-model _gradient_acc _input_mode _learning_rate _num_epochs _seed _seq_len _task _warmup acc eval_loss f1 global_step loss
0 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-ru 0.1 0.716216 0.973621 0.714915 550 NaN
1 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-ru 0.1 0.698630 0.763579 0.698630 1100 NaN
2 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-ru 0.1 0.691781 0.923330 0.691766 1650 NaN
3 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-ru 0.1 0.678082 1.267428 0.675510 2200 NaN
4 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-ru 0.1 0.705479 0.804381 0.705355 2750 NaN
In [30]:
# Political attack/support, cross-validated F1
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/pol_as/eval_results.tsv', sep='\t')

# Paper average F1 0.82, here average F1 0.73, baselines (author) ~0.85
f1_summary = eval_results['f1'].agg([np.mean, np.min, np.max, np.std])
print(f1_summary)
eval_results.head()
mean    0.730625
amin    0.464750
amax    0.876620
std     0.117763
Name: f1, dtype: float64
Out[30]:
_batch_size _bert-model _gradient_acc _input_mode _learning_rate _num_epochs _seed _seq_len _task _warmup acc eval_loss f1 global_step loss
0 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-as 0.1 0.635135 1.267251 0.628390 275 NaN
1 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-as 0.1 0.716216 0.858144 0.710970 550 NaN
2 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-as 0.1 0.783784 0.826216 0.782833 825 NaN
3 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-as 0.1 0.876712 0.294728 0.876620 1100 NaN
4 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-as 0.1 0.849315 0.506954 0.849202 1375 NaN
In [31]:
# Political attack/support/unrelated, cross-validated F1
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/pol_asu/eval_results.tsv', sep='\t')

# Paper only reported precision 0.57, here average F1 0.60
# TODO: use techniques to cope with the class imbalance!
f1_summary = eval_results['f1'].agg([np.mean, np.min, np.max, np.std])
print(f1_summary)
eval_results.head()
mean    0.596784
amin    0.497657
amax    0.685075
std     0.061454
Name: f1, dtype: float64
Out[31]:
_batch_size _bert-model _gradient_acc _input_mode _learning_rate _num_epochs _seed _seq_len _task _warmup acc eval_loss f1 global_step loss
0 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-asu 0.1 0.513514 1.599411 0.497657 550 NaN
1 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-asu 0.1 0.666667 1.145631 0.658627 1100 NaN
2 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-asu 0.1 0.564626 1.191574 0.564747 1650 NaN
3 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-asu 0.1 0.678082 1.055638 0.675802 2200 NaN
4 12 bert-base-uncased 1 both 0.00002 5.0 42 256 political-asu 0.1 0.684932 1.061664 0.685075 2750 NaN

Agreement Paper

  • Accuracy 74%
In [32]:
# Agreement/disagreement, cross-validated accuracy (comparison with paper + baselines)
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=2, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/agreement/eval_results.tsv', sep='\t')

# Paper average acc 0.74, here average acc 0.61
# TODO: the non-cross-val version reached acc ~0.97! Parameters are probably bad,
# 2 epochs might not be enough (try again with a higher epoch count)
acc_summary = eval_results['acc'].agg([np.mean, np.min, np.max, np.std])
print(acc_summary)
eval_results.head()
mean    0.609800
amin    0.492818
amax    0.676923
std     0.057996
Name: acc, dtype: float64
Out[32]:
_batch_size _bert-model _gradient_acc _input_mode _learning_rate _num_epochs _seed _seq_len _task _warmup acc eval_loss f1 global_step loss
0 12 bert-base-uncased 1 both 0.00002 2.0 42 256 agreement 0.1 0.664274 1.192023 0.662413 4388 NaN
1 12 bert-base-uncased 1 both 0.00002 2.0 42 256 agreement 0.1 0.540513 2.735487 0.491917 8776 NaN
2 12 bert-base-uncased 1 both 0.00002 2.0 42 256 agreement 0.1 0.601026 2.020025 0.585966 13164 NaN
3 12 bert-base-uncased 1 both 0.00002 2.0 42 256 agreement 0.1 0.612650 1.766003 0.603526 17552 NaN
4 12 bert-base-uncased 1 both 0.00002 2.0 42 256 agreement 0.1 0.676923 1.355926 0.673433 21940 NaN

Results analyzed

In [33]:
# Import the train/test split functions used during training
import sys 
import os
# TODO: use a relative path or a proper module instead of this hardcoded absolute path
sys.path.append(os.path.abspath("/media/jannis/GeDaTS/SS19/BA/Code_BA/code_relation_prediction/pytorch"))

from run_classifier_dataset_utils import processors

# Processors recreate the exact splits used in training; 'both' = org + response as input
node_pro = processors['node']('both')
political_as_pro = processors['political-as']('both')
In [34]:
# NoDE results with respect to topic
_, node_test_df = node_pro.get_dev_examples('../data')
eval_preds = pd.read_csv('../pytorch/node_both/eval_preds.csv')

# Only predictions from bert-base (rows 27 onwards); the last column is excluded
# (presumably metadata -- TODO confirm against eval_preds.csv)
res = pd.concat([node_test_df.reset_index(drop=True), eval_preds.iloc[27:,:-1].transpose().reset_index(drop=True)], axis=1)
res = res.replace({0: 'attack', 1: 'support'})

# For now, only one run (run 51) used
# There are errors in every topic, no clear trend visible that some topics are better or worse
# More false classifications of attack than of support (support is the major class)
# Could also look at several runs, or average, etc.
pd.crosstab(res['topic'], [res['label'],res[51]])
Out[34]:
label attack support
51 attack support attack support
topic
Cellphones 1 4 0 5
Gasvehicles 5 1 0 5
Gaymarriage 1 1 0 4
Groundzeromosque 2 3 0 3
Marijuanafree 2 4 2 8
Militaryservice 5 1 1 5
Noflyzone 1 3 1 5
Securityprofiling 2 2 0 4
Solarenergy 2 2 0 11
Vegetarianism 2 0 1 3
In [35]:
# Take the rounded mean prediction over all bert-base runs
# (rounding the mean of 0/1 predictions acts like a majority vote)
res['mean_round'] = eval_preds.iloc[27:,:-1].mean().round().values
res = res.replace({0: 'attack', 1: 'support'})  # maps the new mean_round column onto label names
pd.crosstab(res['topic'], [res['label'],res['mean_round']])
Out[35]:
label attack support
mean_round attack support attack support
topic
Cellphones 1 4 1 4
Gasvehicles 5 1 1 4
Gaymarriage 1 1 0 4
Groundzeromosque 2 3 0 3
Marijuanafree 3 3 2 8
Militaryservice 4 2 2 4
Noflyzone 0 4 1 5
Securityprofiling 1 3 0 4
Solarenergy 2 2 0 11
Vegetarianism 2 0 2 2
In [36]:
# All reported metrics can be recreated from the stored predictions,
# e.g. classification reports or confusion matrices
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of the single best run (row 51), mapped back to label names
best_run_preds = eval_preds.iloc[51,:-1].replace({0: 'attack', 1: 'support'})
print(classification_report(y_pred=best_run_preds, y_true=res['label']))

print(confusion_matrix(res['label'], best_run_preds))
              precision    recall  f1-score   support

      attack       0.82      0.52      0.64        44
     support       0.72      0.91      0.80        58

    accuracy                           0.75       102
   macro avg       0.77      0.72      0.72       102
weighted avg       0.76      0.75      0.73       102

[[23 21]
 [ 5 53]]
In [37]:
# Political results with respect to topic
splits_data = political_as_pro.get_splits('../data')

# Column 3 of each split holds that fold's test frame; concatenate all
# folds into a single test DataFrame
pol_test_df = pd.concat(np.array(splits_data)[:,3])
eval_preds = pd.read_csv('../pytorch/pol_as/eval_preds.csv')


# stack() flattens the per-fold prediction rows; assumes the fold order
# matches the concatenation order of the test frames — TODO confirm
pol_test_df['preds'] =  eval_preds.iloc[:,:-1].stack().values
pol_test_df = pol_test_df.replace({0: 'attack', 1: 'support'})


pd.crosstab(pol_test_df['topic'], [pol_test_df['label'],pol_test_df['preds']])
Out[37]:
label attack support
preds attack support attack support
topic
cuba 26 12 16 24
disarmament 45 31 25 83
medical care,health care 49 26 33 39
minimum wage 122 3 18 62
unemployment 51 13 18 35
In [38]:
# Political results with respect to author:
# predictions broken down by speaker of the original vs. the response
pd.crosstab(pol_test_df['preds'], [pol_test_df['org_stance'],pol_test_df['response_stance']])
Out[38]:
org_stance Kennedy Nixon
response_stance Kennedy Nixon Kennedy Nixon
preds
attack 40 187 155 21
support 157 57 72 42
In [39]:
# Complete results on the political data (all folds "summed" together)
print(classification_report(y_pred=pol_test_df['preds'], y_true=pol_test_df['label']))

print(confusion_matrix(y_pred=pol_test_df['preds'], y_true=pol_test_df['label']))
              precision    recall  f1-score   support

      attack       0.73      0.78      0.75       378
     support       0.74      0.69      0.71       353

    accuracy                           0.73       731
   macro avg       0.73      0.73      0.73       731
weighted avg       0.73      0.73      0.73       731

[[293  85]
 [110 243]]
In [40]:
# Results with respect to same org / same resp
# (does the same argument always get the same prediction?)

# Same org, debate data (run 51)
# One org does not always get the same prediction (but often it does)
pd.crosstab(res['org'], res[51])
Out[40]:
51 attack support
org
A no-fly zone over Libya has a significant impact on conflict. 1 6
Airport security profiling is an effective strategy. 1 6
Animal life is equivalent in value to human life. Mahatma Gandhi said "To my mind, the life of a lamb is no less precious than that of a human being". 2 0
Compared with their petroleum-powered counterparts, natural gas vehicles greatly reduce greenhouse gas emissions. The exhaust created from natural gas contains 70 percent less carbon monoxide, nearly 90 percent less nitrogen oxide and non-methane organic gas, and virtually no particulate matter. 1 1
Conscripts are never as good as professional soldiers. 1 1
Cordoba House is no act of tolerance, but of excess/arrogance. Building this structure on the edge of the battlefield created by radical Islamists is not a celebration of religious pluralism and mutual tolerance; it is a political statement of shocking arrogance and hypocrisy. 1 0
Even if marijuana's effects were isolated to the individual, there is room for the state to protect individuals from harming themselves. This is why it is illegal, in some places, not wear a seat belt. If marijuana's effects are seen as clearly harmful, the state can justly protect its citizens from it. 1 0
Gasoline vehicles can be converted to run on natural gas. This means that heavy-polluting vehicles can be transformed into much lower-emission vehicles. This is key, as the millions of gasoline vehicles on the road currently cannot be immediately removed from the road. 1 0
Gay marriage is a civil right. 1 5
Humans and animals are of equal value. 1 3
Individuals should be at liberty to use marijuana. 2 5
Individuals should be free to use marijuana. If an individual wants to harm themselves, they should be free to do so. 1 1
Mandatory service is a very cost-efficient defence solution. Many European countries who have abandoned military service have had lots of problems recruiting. 1 0
Many solar energy systems are now price competitive with coal. 0 4
Marijuana's public health costs violate tax-payer liberties. 0 1
Name "Cordoba House" is a very direct historical indication that the Ground Zero mosque is all about conquest and thus an assertion of Islamist triumphalism. 0 2
National conscription is important to national security. 3 5
Natural gas vehicles help cut emissions and fight global warming. 2 4
No fly zone in Libya prevents a massacre. 0 2
No fly zones have a history of limited effectiveness. 1 0
No justification for mandatory service where no threat exists. 1 0
On the surface, natural gas cars seem alright, but the topic becomes a bit different when these cars are competing against "zero emission" alternatives such as electric cars that are powered utilizing a solar grid. 1 1
Physically holding a handset removes one hand from the controls, making accidents more likely, while dialling is even worse, as it also requires the user to divert their attention away from the road. 0 1
Regulation could negate the safety benefits of having a phone in the car. When you're stuck in traffic, calling to say you'll be late can reduce stress and make you less inclined to drive aggressively to make up lost time. 1 0
Research shows that drivers speaking on a mobile phone have much slower reactions in braking tests than non-users, and are worse even than if they have been drinking. 0 1
Solar energy is abundant. Every minute, enough energy arrives at planet Earth to meet human energy demands for a year. It is, therefore, the most abundant energy source available to humans. This abundance makes it an economic gem. 2 0
Solar energy is economically sound. 0 9
The intentions of Cordoba House (also referred to as the "Ground Zero Mosque") developers are pure. 1 4
The use of cell-phones while driving is a public hazard. 0 7
Various risks to other citizens are greatly enhanced by marijuana use. 0 5
We have seen that certain types of people who fit a certain profile – young men of a particular ethnic background – have been engaged in terror activities, and targeting this sort of passenger would give people a greater sense of security. Profiling has to be backed by this type of statistical and intelligence-based evidence. There would be no point in stopping Muslim grandmothers. 1 0
In [41]:
# Same org, political data
# TODO: aggregate to get some useful insights
# (and maybe do it for every fold individually, because otherwise one org
# could get one label in one fold and a different label in another fold)
pd.crosstab(pol_test_df['org'], pol_test_df['preds']).head()
Out[41]:
preds attack support
org
But Mr. Nixon and the party he leads say they are content , just as they have always been content , in the face of poverty and unemployment and an America with its most urgent needs unmet . The Republican Party which Mr. Nixon leads today is the same Republican Party which for half a century has opposed every single progressive measure which the Democrats have designed to improve human welfare and reduce human misery - the party which fought against the New Deal and tried to block the Fair Deal - the party which , in the past 8 years , has vetoed aid to areas of unemployment , blocked efforts to improve unemployment compensation , opposed raising the minimum wage , refused to expand the distribution of surplus food to the hungry , and failed to offer one single program to increase the welfare of the American people . But Americans will not forget Mr. Nixon 's party this November - and they will return the Democrats to leadership so that all Americans can share in American abundance . 0 1
But Mr. Nixon and the party he leads say they are content , just as they have always been content , in the face of poverty and unemployment and an America with its most urgent needs unmet . The Republican Party which Mr. Nixon leads today is the same Republican Party which for half a century has opposed every single progressive measure which the Democrats have designed to improve human welfare and reduce human misery - the party which fought against the New Deal and tried to block the Fair Deal - the party which , in the past 8 years has voted aid to areas of unemployment , blocked efforts to improve unemployment compensation , opposed raising the minimum wage , refused to expand the distribution of surplus food to the hungry , and failed to offer one single program to increase the welfare of the American people . But Americans will not forget Mr. Nixon 's party this November - and they will return the Democrats to leadership so that all Americans can share in American abundance . 1 1
But his voting record as Congressman , Senator , and Vice President has been consistently antilabor . He says he supports the minimum wage - but as a Congressman he voted to eliminate from its protection a million workers already covered : and as Vice President he opposed our efforts to expand coverage and raise the minimum to $ 1.25 an hour . He makes campaign promises to help our distressed areas and our unemployed workers - but as Vice President he has consistently opposed our party 's efforts , sparked by Senator Douglas , to get such measures enacted . 1 0
I am sure that the threat of a White House veto mentioned by every speaker opposing the amendment contributed to this defeat . In addition , it should be noted that only one Republican voted for this sound approach to the medical care for the aged program . This vote demonstrates that if we 're going to have effective legislation in this and other fields , we 're going to have to have an administration that will provide leadership and a Congress that will act . 0 1
Is it true ? And the answer is , Of course it is n't true . The answer is - and now I 'm going to give you my version - the answer is that America , is moving forward , that unemployment went down more than usual last month and employment went up to record heights . The answer is that America will continue to move forward , continue to move forward , that is , unless we have some of our politicians talk us into a recession - and we are n't going to allow that to happen . 0 1
In [42]:
# Same response, political data
pd.crosstab(pol_test_df['response'], pol_test_df['preds']).head()
Out[42]:
preds attack support
response
But Mr. Nixon and the party he leads say they are content , just as they have always been content , in the face of poverty and unemployment and an America with its most urgent needs unmet . The Republican Party which Mr. Nixon leads today is the same Republican Party which for half a century has opposed every single progressive measure which the Democrats have designed to improve human welfare and reduce human misery - the party which fought against the New Deal and tried to block the Fair Deal - the party which , in the past 8 years , has vetoed aid to areas of unemployment , blocked efforts to improve unemployment compensation , opposed raising the minimum wage , refused to expand the distribution of surplus food to the hungry , and failed to offer one single program to increase the welfare of the American people . But Americans will not forget Mr. Nixon 's party this November - and they will return the Democrats to leadership so that all Americans can share in American abundance . 0 2
But Mr. Nixon and the party he leads say they are content , just as they have always been content , in the face of poverty and unemployment and an America with its most urgent needs unmet . The Republican Party which Mr. Nixon leads today is the same Republican Party which for half a century has opposed every single progressive measure which the Democrats have designed to improve human welfare and reduce human misery - the party which fought against the New Deal and tried to block the Fair Deal - the party which , in the past 8 years has voted aid to areas of unemployment , blocked efforts to improve unemployment compensation , opposed raising the minimum wage , refused to expand the distribution of surplus food to the hungry , and failed to offer one single program to increase the welfare of the American people . But Americans will not forget Mr. Nixon 's party this November - and they will return the Democrats to leadership so that all Americans can share in American abundance . 0 2
But his voting record as Congressman , Senator , and Vice President has been consistently antilabor . He says he supports the minimum wage - but as a Congressman he voted to eliminate from its protection a million workers already covered : and as Vice President he opposed our efforts to expand coverage and raise the minimum to $ 1.25 an hour . He makes campaign promises to help our distressed areas and our unemployed workers - but as Vice President he has consistently opposed our party 's efforts , sparked by Senator Douglas , to get such measures enacted . 0 1
But in the next breath he tells us that we ca n't afford to do those things in America that everybody knows we need to do . He says we ca n't afford to do what this great country should be doing in education , in programs for the aging , in medical care , in housing , in development of our natural resources . I say we can . 1 1
I am sure that the threat of a White House veto mentioned by every speaker opposing the amendment contributed to this defeat . In addition , it should be noted that only one Republican voted for this sound approach to the medical care for the aged program . This vote demonstrates that if we 're going to have effective legislation in this and other fields , we 're going to have to have an administration that will provide leadership and a Congress that will act . 1 0

Results with other inputs

  • TODO: test with only the original arguments as input and with only the responses as input
    • Which arguments are likely to be attacked or supported
    • Which responses tend to attack or to support
In [43]:
# Only org
In [44]:
# Only resp

Domain adaptation etc.

  • TODO: do some domain adaptation etc.
In [45]:
# Train on one dataset, evaluate on another (without finetuning)
In [46]:
# With finetuning (reusing the classification layer)
In [47]:
# With finetuning + use a new classification layer